In [1]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
data = pd.read_csv("C:/Users/abhishek/OneDrive/Desktop/dt/dataset.csv")
data.head()
Out[2]:
Marital status Application mode Application order Course Daytime/evening attendance Previous qualification Nacionality Mother's qualification Father's qualification Mother's occupation ... Curricular units 2nd sem (credited) Curricular units 2nd sem (enrolled) Curricular units 2nd sem (evaluations) Curricular units 2nd sem (approved) Curricular units 2nd sem (grade) Curricular units 2nd sem (without evaluations) Unemployment rate Inflation rate GDP Target
0 1 6 1 11 1 1 1 1 3 4 ... 0 6 6 6 13.666667 0 13.9 -0.3 0.79 Enrolled
1 1 8 2 15 1 1 1 23 27 6 ... 0 6 10 5 12.400000 0 9.4 -0.8 -3.12 Enrolled
2 2 12 1 3 0 1 1 22 28 10 ... 0 6 6 6 13.000000 0 13.9 -0.3 0.79 Enrolled
3 2 12 1 17 0 12 1 22 27 10 ... 0 5 17 5 11.500000 5 16.2 0.3 -0.92 Enrolled
4 1 1 1 12 1 1 1 13 28 8 ... 0 8 8 8 14.345000 0 15.5 2.8 -4.06 Enrolled

5 rows × 35 columns

In [3]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nacionality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                             4424 non-null   int64  
 10  Father's occupation                             4424 non-null   int64  
 11  Displaced                                       4424 non-null   int64  
 12  Educational special needs                       4424 non-null   int64  
 13  Debtor                                          4424 non-null   int64  
 14  Tuition fees up to date                         4424 non-null   int64  
 15  Gender                                          4424 non-null   int64  
 16  Scholarship holder                              4424 non-null   int64  
 17  Age at enrollment                               4424 non-null   int64  
 18  International                                   4424 non-null   int64  
 19  Curricular units 1st sem (credited)             4424 non-null   int64  
 20  Curricular units 1st sem (enrolled)             4424 non-null   int64  
 21  Curricular units 1st sem (evaluations)          4424 non-null   int64  
 22  Curricular units 1st sem (approved)             4424 non-null   int64  
 23  Curricular units 1st sem (grade)                4424 non-null   float64
 24  Curricular units 1st sem (without evaluations)  4424 non-null   int64  
 25  Curricular units 2nd sem (credited)             4424 non-null   int64  
 26  Curricular units 2nd sem (enrolled)             4424 non-null   int64  
 27  Curricular units 2nd sem (evaluations)          4424 non-null   int64  
 28  Curricular units 2nd sem (approved)             4424 non-null   int64  
 29  Curricular units 2nd sem (grade)                4424 non-null   float64
 30  Curricular units 2nd sem (without evaluations)  4424 non-null   int64  
 31  Unemployment rate                               4424 non-null   float64
 32  Inflation rate                                  4424 non-null   float64
 33  GDP                                             4424 non-null   float64
 34  Target                                          4424 non-null   object 
dtypes: float64(5), int64(29), object(1)
memory usage: 1.2+ MB
In [4]:
print(data["Target"].unique())
['Enrolled' 'Graduate' 'Dropout']
In [5]:
data['Target'] = data['Target'].map({
  'Dropout':0,
  'Enrolled':1,
  'Graduate':2
})
print(data["Target"].unique())
[1 2 0]
In [6]:
data.corr()['Target']
Out[6]:
Marital status                                    0.074310
Application mode                                  0.110086
Application order                                -0.027393
Course                                            0.009877
Daytime/evening attendance                       -0.066439
Previous qualification                            0.068021
Nacionality                                      -0.009248
Mother's qualification                            0.071100
Father's qualification                            0.033291
Mother's occupation                              -0.077593
Father's occupation                              -0.094916
Displaced                                        -0.070649
Educational special needs                        -0.003751
Debtor                                            0.154802
Tuition fees up to date                          -0.342121
Gender                                            0.118454
Scholarship holder                               -0.114517
Age at enrollment                                 0.201806
International                                    -0.015893
Curricular units 1st sem (credited)               0.002464
Curricular units 1st sem (enrolled)              -0.052020
Curricular units 1st sem (evaluations)           -0.125278
Curricular units 1st sem (approved)              -0.290243
Curricular units 1st sem (grade)                 -0.349652
Curricular units 1st sem (without evaluations)    0.021565
Curricular units 2nd sem (credited)               0.002427
Curricular units 2nd sem (enrolled)              -0.060670
Curricular units 2nd sem (evaluations)           -0.194412
Curricular units 2nd sem (approved)              -0.351135
Curricular units 2nd sem (grade)                 -0.429214
Curricular units 2nd sem (without evaluations)    0.040991
Unemployment rate                                 0.037279
Inflation rate                                    0.021798
GDP                                              -0.037052
Target                                            1.000000
Name: Target, dtype: float64
In [7]:
#correlation heatmap
plt.figure(figsize=(30, 30))
sns.heatmap(data.corr() , annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
In [19]:
#lowest features
correlations = data.corr()['Target']
top_10_features = correlations.abs().nsmallest(10).index
top_10_corr_values = correlations[top_10_features]
for i in range(10):
    print(top_10_features[i],'-',top_10_corr_values[i])
Curricular units 2nd sem (credited) - 0.0024272668160775935
Curricular units 1st sem (credited) - 0.0024639396702293673
Educational special needs - -0.0037511261997920774
Nacionality - -0.009247826022182699
Course - 0.009877475210117816
International - -0.015893341271952886
Curricular units 1st sem (without evaluations) - 0.021564930610866943
Inflation rate - 0.021797568994223828
Application order - -0.027392621567232125
Father's qualification - 0.0332911238103344
In [20]:
#barplot for lowest features
plt.figure(figsize=(10, 11))
plt.bar(top_10_features, top_10_corr_values)
plt.xlabel('Features')
plt.ylabel('Correlation with Target')
plt.title('Top 10 Features with Lowest correlation Correlation to Target')
plt.xticks(rotation=45)
plt.show()
In [10]:
#dropping few columns in our model because they are insignificant
new_data = data.copy()
new_data = new_data.drop(columns=['Nacionality', 
                 'Mother\'s qualification', 
                 'Father\'s qualification', 
                 'Educational special needs', 
                 'International', 
                 'Curricular units 1st sem (without evaluations)',
                 'Unemployment rate', 
                 'Inflation rate'], axis=1)
new_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 27 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Mother's occupation                             4424 non-null   int64  
 7   Father's occupation                             4424 non-null   int64  
 8   Displaced                                       4424 non-null   int64  
 9   Debtor                                          4424 non-null   int64  
 10  Tuition fees up to date                         4424 non-null   int64  
 11  Gender                                          4424 non-null   int64  
 12  Scholarship holder                              4424 non-null   int64  
 13  Age at enrollment                               4424 non-null   int64  
 14  Curricular units 1st sem (credited)             4424 non-null   int64  
 15  Curricular units 1st sem (enrolled)             4424 non-null   int64  
 16  Curricular units 1st sem (evaluations)          4424 non-null   int64  
 17  Curricular units 1st sem (approved)             4424 non-null   int64  
 18  Curricular units 1st sem (grade)                4424 non-null   float64
 19  Curricular units 2nd sem (credited)             4424 non-null   int64  
 20  Curricular units 2nd sem (enrolled)             4424 non-null   int64  
 21  Curricular units 2nd sem (evaluations)          4424 non-null   int64  
 22  Curricular units 2nd sem (approved)             4424 non-null   int64  
 23  Curricular units 2nd sem (grade)                4424 non-null   float64
 24  Curricular units 2nd sem (without evaluations)  4424 non-null   int64  
 25  GDP                                             4424 non-null   float64
 26  Target                                          4424 non-null   int64  
dtypes: float64(3), int64(24)
memory usage: 933.3 KB
In [11]:
new_data['Target'].value_counts()
Out[11]:
Target
1    2209
2    1421
0     794
Name: count, dtype: int64
In [12]:
x = new_data['Target'].value_counts().index
y = new_data['Target'].value_counts().values

df = pd.DataFrame({
  'Target': x,
  'Count_T' : y
})
In [13]:
fig = px.pie(df,
             names='Target',
             values='Count_T',
             title='How many dropouts, enrolled & graduates are there in Target column')

fig.update_traces(labels=['Enrolled', 'Graduated', 'Dropout'], hole=0.4, textinfo='value+label', pull=[0, 0.2, 0.1])
fig.show()
C:\Users\abhishek\anaconda3_1\Lib\site-packages\numpy\core\numeric.py:2468: FutureWarning:

elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison

In [14]:
#top 10 highest features
correlations = data.corr()['Target']
top_10_features = correlations.abs().nlargest(10).index
top_10_corr_values = correlations[top_10_features]
for i in range(10):
    print(top_10_features[i],'-',top_10_corr_values[i])
Target - 1.0
Curricular units 2nd sem (grade) - -0.4292142441814479
Curricular units 2nd sem (approved) - -0.35113529228045165
Curricular units 1st sem (grade) - -0.34965164276834676
Tuition fees up to date - -0.342120547448968
Curricular units 1st sem (approved) - -0.29024327759856267
Age at enrollment - 0.20180618945687442
Curricular units 2nd sem (evaluations) - -0.1944116159981643
Debtor - 0.15480160305546845
Curricular units 1st sem (evaluations) - -0.1252778058589179
In [15]:
#bar plot for that
plt.figure(figsize=(10, 11))
plt.bar(top_10_features, top_10_corr_values)
plt.xlabel('Features')
plt.ylabel('Correlation with Target')
plt.title('Top 10 Features with Highest Correlation to Target')
plt.xticks(rotation=45)
plt.show()
In [16]:
px.histogram(new_data['Age at enrollment'], x='Age at enrollment',color_discrete_sequence=['lightblue'])
In [17]:
sns.boxplot(x='Target', y='Age at enrollment', data=new_data)
plt.xlabel('Target')
plt.ylabel('Age')
plt.title('Relationship between Age and Target')
plt.show()
In [18]:
# packages needed for models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn import svm


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
In [19]:
#training and testing the models
X = new_data.drop('Target', axis=1)
y = new_data['Target']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_test_array = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
In [20]:
#introduction of various models
dtree = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=2)
lr = LogisticRegression(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1, random_state=0)
xbc = XGBClassifier(tree_method='hist')
svm = svm.SVC(kernel='linear',probability=True)
In [21]:
#testing of various models
dtree.fit(X_train,y_train)
rfc.fit(X_train,y_train)
lr.fit(X_train,y_train)
knn.fit(X_train,y_train)
abc.fit(X_train, y_train)
xbc.fit(X_train, y_train)
svm.fit(X_train, y_train)
C:\Users\abhishek\anaconda3_1\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Out[21]:
SVC(kernel='linear', probability=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(kernel='linear', probability=True)
In [22]:
#decision tree accuracy
y_pred = dtree.predict(X_test)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
Accuracy : 66.55 %
In [23]:
#random forest accuracy
y_pred = rfc.predict(X_test)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
Accuracy : 77.4 %
In [24]:
#logistic regression accuracy
y_pred = lr.predict(X_test)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
Accuracy : 77.06 %
In [25]:
#kneighbors accuracy
y_pred = knn.predict(X_test_array)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
C:\Users\abhishek\anaconda3_1\Lib\site-packages\sklearn\base.py:464: UserWarning:

X does not have valid feature names, but KNeighborsClassifier was fitted with feature names

Accuracy : 69.6 %
In [26]:
#adaboost
y_pred = abc.predict(X_test)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
Accuracy : 74.92 %
In [27]:
#xgb
y_pred = xbc.predict(X_test)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
Accuracy : 76.84 %
In [28]:
#svm
y_pred = svm.predict(X_test)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
Accuracy : 76.95 %
In [29]:
#soft voting
ens1 = VotingClassifier(estimators=[('rfc', rfc), ('lr', lr), ('abc',abc), ('xbc',xbc)], voting='soft')
ens1.fit(X_train, y_train)

y_pred = ens1.predict(X_test)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
C:\Users\abhishek\anaconda3_1\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Accuracy : 79.32 %
In [30]:
#hard voting
ens2 = VotingClassifier(estimators=[('rfc', rfc), ('lr', lr), ('abc',abc), ('xbc',xbc)], voting='hard')
ens2.fit(X_train, y_train)

y_pred = ens2.predict(X_test)
print("Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
C:\Users\abhishek\anaconda3_1\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Accuracy : 77.4 %
In [ ]: